Introduction

This R Markdown file is meant to streamline the analysis of the bivalency data and keep it organized in one place. Different types of analyses will be in different sections with explanations as to why those are important. This section will contain only data loading and functions that are necessary for downstream analyses.

Data loading

This chunk loads in both the HMD data and the bulk RNA-seq data, as well as the tidyverse and sleuth libraries. NEW: It also conducts signal correction for H3K9me3 on the trans bivalency dataset.

library(topGO)
library(GO.db)
library(biomaRt)
library(tidyverse)
library(sleuth)

load("rdata/data_loaded.RData")
load("rdata/sleuth_loaded_bivalency_bulk-rna-seq.RData")

#Redo the HMDs for 0-400bp about the TSS

naive_hmd = data.frame(Gene = naive$k4$Gene)
naive_hmd$k4 = naive$k4 %>% select(X0:X400) %>% rowMeans
naive_hmd$k9 = naive$k9 %>% select(X0:X400) %>% rowMeans
naive_hmd$k27 = naive$k27 %>% select(X0:X400) %>% rowMeans
naive_hmd$cis = naive$cis %>% select(X0:X400) %>% rowMeans
naive_hmd$trans = naive$trans %>% select(X0:X400) %>% rowMeans
naive_hmd$Gene = as.character(naive_hmd$Gene)

primed_hmd = data.frame(Gene = primed$k4$Gene)
primed_hmd$k4 = primed$k4 %>% select(X0:X400) %>% rowMeans
primed_hmd$k9 = primed$k9 %>% select(X0:X400) %>% rowMeans
primed_hmd$k27 = primed$k27 %>% select(X0:X400) %>% rowMeans
primed_hmd$cis = primed$cis %>% select(X0:X400) %>% rowMeans
primed_hmd$trans = primed$trans %>% select(X0:X400) %>% rowMeans
primed_hmd$Gene = as.character(primed_hmd$Gene)

npc_hmd = data.frame(Gene = npc$k4$Gene)
npc_hmd$k4 = npc$k4 %>% select(X0:X400) %>% rowMeans
npc_hmd$k9 = npc$k9 %>% select(X0:X400) %>% rowMeans
npc_hmd$k27 = npc$k27 %>% select(X0:X400) %>% rowMeans
npc_hmd$cis = npc$cis %>% select(X0:X400) %>% rowMeans
npc_hmd$trans = npc$trans %>% select(X0:X400) %>% rowMeans
npc_hmd$Gene = as.character(npc_hmd$Gene)

hmds = list()
hmds[["naive"]] = naive_hmd
hmds[["primed"]] = primed_hmd
hmds[["npc"]] = npc_hmd

rm(naive_hmd, primed_hmd, npc_hmd)

tss_range = seq(-3, 3, by = 0.05)

sleuth_objs = list()
sleuth_objs[["all"]] = so_all
sleuth_objs[["naive_npc"]] = so_naive_npc
sleuth_objs[["naive_primed"]] = so_naive_primed
sleuth_objs[["primed_npc"]] = so_primed_npc

rm(so_all, so_naive_npc, so_naive_primed, so_primed_npc)

rm(metadata_naive_npc, metadata_naive_primed, metadata_primed_npc)

specificities = read.table("off_target.txt", header = T, row.names = 1)

hmds$naive = hmds$naive %>% mutate(trans = (trans - specificities["naive",]$trans_ip_k9 * k9)/(1 - specificities["naive",]$trans_ip_k9*specificities["naive",]$k9_ip_trans))
hmds$primed = hmds$primed %>% mutate(trans = (trans - specificities["primed",]$trans_ip_k9 * k9)/(1 - specificities["primed",]$trans_ip_k9*specificities["primed",]$k9_ip_trans))
hmds$npc = hmds$npc %>% mutate(trans = (trans - specificities["npc",]$trans_ip_k9 * k9)/(1 - specificities["npc",]$trans_ip_k9*specificities["npc",]$k9_ip_trans))

rm(specificities)

The next chunk loads in functions to make metagenes.

get_biv_genes = function(dataset, threshold){
  genes = (dataset %>% filter(trans > threshold))$Gene
  return(genes)
}

metagene = function(dataset, range, genes){
  dataset_names = names(dataset)
  summ_df = data.frame(dist = range)
  
  for (i in dataset_names){
    summ_df[,i] = as.numeric(dataset[[i]] %>% filter(Gene %in% genes) %>% dplyr::select(-1) %>% summarize(across(.fns = mean)))
  }
  
  return(summ_df)
}

metagene_plot = function(summ_df, ylims = numeric(), title = NA, legend_overlap = F, title_overlap = T){
  summ_plt = summ_df %>% gather("mod", "hmd", -dist)
  plot_summary = ggplot(data = summ_plt, aes(x = dist, y = hmd, color = mod)) + geom_line(size = 1) + theme_classic() + scale_x_continuous(expand = c(0,0), name = "Dist. from TSS (kb)") + theme(text = element_text(family = "sans", color = "black"), title = element_text(size = 10, color = "black"), axis.text = element_text(size = 10, color = "black"), axis.title = element_text(size = 10, color = "black"))
  
  if (length(ylims)>0){
    plot_summary = plot_summary + scale_y_continuous(limits = ylims, expand = c(0,0), name = "HMD (%)")
  } else {
    plot_summary = plot_summary + scale_y_continuous(expand = c(0,0), name = "HMD (%)")
  }
  
  if (!is.na(title) & title_overlap){ plot_summary = plot_summary + ggtitle(title) }
  
  if(legend_overlap){ plot_summary = plot_summary + theme(legend.position = 'none')}
  
  print(plot_summary)
}

First-Pass HMD Metagene Analysis

First, we have to have a sense of how bivalency exists at all genes. To do this, we will plot metagenes of all genes for all samples.

metagene_plot(metagene(naive, tss_range, consensusGenes), ylims = c(0,80), title = "Naive mESCs")

metagene_plot(metagene(primed, tss_range, consensusGenes), ylims = c(0,80), title = "Primed mESCs")

metagene_plot(metagene(npc, tss_range, consensusGenes), ylims = c(0,80), title = "NPCs")

We observe that there is a significant amount of bivalency (especially trans bivalency) at all genes from naive, primed, and NPC samples. Importantly, it seems that bivalency actually increases over time rather than decreasing. This is interesting. My next step is to identify the average amount of bivalency in each of these samples – and that of all other modifications, while I’m at it.

hmd_summary_precursor = hmds[["naive"]] %>% mutate(sample = "naive") %>% select(sample, k4:trans)
hmd_summary_precursor = rbind(hmd_summary_precursor, (hmds[["primed"]] %>% mutate(sample = "primed") %>% select(sample, k4:trans)))
hmd_summary_precursor = rbind(hmd_summary_precursor, (hmds[["npc"]] %>% mutate(sample = "npc") %>% select(sample, k4:trans)))

hmd_summary_all_genes = hmd_summary_precursor %>% group_by(sample) %>% summarize(across(.fns = mean))
print("HMD Summary Mean")
## [1] "HMD Summary Mean"
print(hmd_summary_all_genes)
## # A tibble: 3 x 6
##   sample    k4    k9   k27   cis trans
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 naive   28.7  9.44  27.3  6.51  20.6
## 2 npc     30.5 18.1   21.1  4.49  38.6
## 3 primed  20.1  9.35  61.3  6.91  51.2
hmd_summary_all_genes = hmd_summary_precursor %>% group_by(sample) %>% summarize(across(.fns = median))
print("HMD Summary Median")
## [1] "HMD Summary Median"
print(hmd_summary_all_genes)
## # A tibble: 3 x 6
##   sample    k4    k9   k27   cis trans
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 naive   24.7  8.38  20.0  5.09  15.7
## 2 npc     21.5 15.9   16.0  3.14  25.2
## 3 primed  18.0  8.17  53.6  5.39  39.0
rm(hmd_summary_precursor, hmd_summary_all_genes)

Both by mean and median, we see that bivalency again increases from naive to primed to npc, with median trans HMDs of approximately 20%, 40%, and 30%, respectively. This means it will be somewhat difficult to identify genes that are truly “bivalent,” so we’re goign to go with initial thresholds of 10%, 25%, and 50%. We will load these all into lists for the sake of ease just to make it easier to handle them in the future.

bivalent_genes_10 = list()
bivalent_genes_25 = list()
bivalent_genes_50 = list()

bivalent_genes_10[["naive"]] = get_biv_genes(hmds[["naive"]], 10)
bivalent_genes_25[["naive"]] = get_biv_genes(hmds[["naive"]], 25)
bivalent_genes_50[["naive"]] = get_biv_genes(hmds[["naive"]], 50)

bivalent_genes_10[["primed"]] = get_biv_genes(hmds[["primed"]], 10)
bivalent_genes_25[["primed"]] = get_biv_genes(hmds[["primed"]], 25)
bivalent_genes_50[["primed"]] = get_biv_genes(hmds[["primed"]], 50)

bivalent_genes_10[["npc"]] = get_biv_genes(hmds[["npc"]], 10)
bivalent_genes_25[["npc"]] = get_biv_genes(hmds[["npc"]], 25)
bivalent_genes_50[["npc"]] = get_biv_genes(hmds[["npc"]], 50)

As expected, the proportion of genes with the listed amount of bivalency decreases as we increase the threshold from 10% to 25% to 50%. We now wish to watch the bivalent genes by each threshold in naive cells over the course of differentiation.

metagene_plot(metagene(naive, tss_range, bivalent_genes_10[["naive"]]), title = "Naive cells at genes with trans bivalent >10% in naive")

metagene_plot(metagene(primed, tss_range, bivalent_genes_10[["naive"]]), title = "Primed cells at genes with trans bivalent >10% in naive")

metagene_plot(metagene(npc, tss_range, bivalent_genes_10[["naive"]]), title = "NPC cells at genes with trans bivalent >10% in naive")

metagene_plot(metagene(naive, tss_range, bivalent_genes_25[["naive"]]), title = "Naive cells at genes with trans bivalent >25% in naive")

metagene_plot(metagene(primed, tss_range, bivalent_genes_25[["naive"]]), title = "Primed cells at genes with trans bivalent >25% in naive")

metagene_plot(metagene(npc, tss_range, bivalent_genes_25[["naive"]]), title = "NPC cells at genes with trans bivalent >25% in naive")

metagene_plot(metagene(naive, tss_range, bivalent_genes_50[["naive"]]), title = "Naive cells at genes with trans bivalent >50% in naive")

metagene_plot(metagene(primed, tss_range, bivalent_genes_50[["naive"]]), title = "Primed cells at genes with trans bivalent >50% in naive")

metagene_plot(metagene(npc, tss_range, bivalent_genes_50[["naive"]]), title = "NPC cells at genes with trans bivalent >50% in naive")

Basically, no matter how we call it, bivalent genes seem to broadly stay bivalent.

Dominance sectioning

Let’s see if sectioning this by K27 dominant bivalency or K4 dominant bivalency in the naive dataset helps at all.

naive_dominance = list()
hmds[["naive"]] = hmds[["naive"]] %>% mutate(logratio = log(k27+1) - log(k4+1))
naive_dominance[["k27"]] = (hmds[["naive"]] %>% filter(trans > 25) %>% filter(logratio > 1))$Gene
naive_dominance[["k4"]] = (hmds[["naive"]] %>% filter(trans > 25) %>% filter(logratio < -1))$Gene
naive_dominance[["mid"]] = (hmds[["naive"]] %>% filter(trans > 25) %>% filter(logratio <= 1 & logratio >= -1))$Gene

Overall, we see that with an \(e^1\) ratio threshold (logratio 1), there are 1728 K27 dominant genes, 6335 K4 dominant genes, and 7704 neutral genes with a bivalency threshold of 25%. Of course, we now want to do metagene analysis.

metagene_plot(metagene(naive, tss_range, naive_dominance[["k27"]]), title = "Naive cells at genes with trans bivalent >25% and K27 dominance in naive")

metagene_plot(metagene(primed, tss_range, naive_dominance[["k27"]]), title = "Primed cells at genes with trans bivalent >25% and K27 dominance in naive")

metagene_plot(metagene(npc, tss_range, naive_dominance[["k27"]]), title = "NPC cells at genes with trans bivalent >25% and K27 dominance in naive")

metagene_plot(metagene(naive, tss_range, naive_dominance[["k4"]]), title = "Naive cells at genes with trans bivalent >25% and k4 dominance in naive")

metagene_plot(metagene(primed, tss_range, naive_dominance[["k4"]]), title = "Primed cells at genes with trans bivalent >25% and k4 dominance in naive")

metagene_plot(metagene(npc, tss_range, naive_dominance[["k4"]]), title = "NPC cells at genes with trans bivalent >25% and k4 dominance in naive")

metagene_plot(metagene(naive, tss_range, naive_dominance[["mid"]]), title = "Naive cells at genes with trans bivalent >25% and mid dominance in naive")

metagene_plot(metagene(primed, tss_range, naive_dominance[["mid"]]), title = "Primed cells at genes with trans bivalent >25% and mid dominance in naive")

metagene_plot(metagene(npc, tss_range, naive_dominance[["mid"]]), title = "NPC cells at genes with trans bivalent >25% and mid dominance in naive")

Overwhelmingly, it again seems that across these bivalency dominance classes, bivalent genes remain bivalent.

Gene Ontology Setup

My next goal will be to conduct metagene analysis on different gene ontologies. The goal here will be to load up the gene ontology files and accessions so that I don’t have to download it again in the future.

if (!file.exists("rdata/go_terms.RData")){
  ensembl = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl")
  attributes = biomaRt::listAttributes(ensembl)
  filters = biomaRt::listFilters(ensembl)
  
  go_refseq_mrna = biomaRt::getBM(attributes = c('refseq_mrna', 'ensembl_gene_id', 'external_gene_name', 'go_id'), filters = 'with_refseq_mrna', values = TRUE, mart = ensembl)
  
  go_refseq_ncrna = biomaRt::getBM(attributes = c('refseq_ncrna', 'ensembl_gene_id', 'external_gene_name', 'go_id'), filters = c('with_refseq_mrna', 'with_refseq_ncrna'), values = list(FALSE, TRUE), mart = ensembl)
  
  colnames(go_refseq_mrna) = c('refseq_id', 'ensembl_gene_id', 'external_gene_name', 'go_id')
  colnames(go_refseq_ncrna) = c('refseq_id', 'ensembl_gene_id', 'external_gene_name', 'go_id')
  refseq_ensembl_external_go = rbind(go_refseq_mrna, go_refseq_ncrna) %>% filter(refseq_id != "") %>% filter(go_id != "")
  
  refseq_with_go = refseq_ensembl_external_go$refseq_id %>% unique
  ensembl_with_go = refseq_ensembl_external_go$ensembl_gene_id %>% unique
  genename_with_go = (refseq_ensembl_external_go %>% filter(external_gene_name != ""))$external_gene_name %>% unique
  
  rm(ensembl, attributes, filters, go_refseq_mrna, go_refseq_ncrna)
  save.image("rdata/go_terms.RData")
} else {
  load("rdata/go_terms.RData")
}

We now have GO ids for Refseq, Ensembl, and external gene names in M. musculus. The next thing to do is to assemble lists of the GO terms for each class of genes that we will be interested in.

First, we will want to get GO terms and their offspring for developmental processes, embryo development, and maintenance of pluripotency. Then, we will examine the metabolic processes. As a negative control, so to speak, we will also pull out terms for immune system processes. Finally, we will get GO terms for neurodevelopmental processes, as these genes should be “turned on” over differentiation to NPCs.

go_ids = list()

go_ids[["developmental_processes"]] = c(GO.db::GOBPOFFSPRING[["GO:0032502"]], "GO:0032502")
go_ids[["embryo_development_ids"]] = c(GO.db::GOBPOFFSPRING[["GO:0009790"]], "GO:0009790")
go_ids[["maintenance_pluripotency"]] = c(GO.db::GOBPOFFSPRING[["GO:0019827"]], "GO:0019827")

go_ids[["metabolic_processes"]] = c(GO.db::GOBPOFFSPRING[["GO:0008152"]], "GO:0008152")

go_ids[["immune_system"]] = c(GO.db::GOBPOFFSPRING[["GO:0002376"]], "GO:0002376")

go_ids[["neuron_differentiation"]] = c(GO.db::GOBPOFFSPRING[["GO:0030182"]], "GO:0030182")

All told, we now have GO term IDs for each of our classes of interest stored in the list go_ids.

Sectioning Metagenes with Gene Ontology

We will now use the different gene ontology terms we have just identified to create metagene plots across different GO term classes.

for (go_term in names(go_ids))
{
  overlapgenes = (refseq_ensembl_external_go %>% filter(go_id %in% go_ids[[go_term]] & refseq_id %in% bivalent_genes_25[["naive"]]))$refseq_id
  
  count_genes = length(overlapgenes)
  
  metagene_plot(metagene(naive, tss_range, overlapgenes), title = paste0("Naive cells at naive 25% bivalent genes HMD in GO term ", go_term, " (", count_genes, " genes)"))
  metagene_plot(metagene(primed, tss_range, overlapgenes), title = paste0("Primed cells at naive 25% bivalent genes HMD in GO term ", go_term, " (", count_genes, " genes)"))
  metagene_plot(metagene(npc, tss_range, overlapgenes), title = paste0("NPC cells at naive 25% bivalent genes HMD in GO term ", go_term, " (", count_genes, " genes)"))
}

rm(count_genes)

Across the different GO term classes, bivalent genes remain bivalent.

RNA-seq First-Pass Analysis

We have three RNA-seq datasets, which we have already aligned against the Refseq cDNA for mm10 using Kallisto and loaded into Sleuth. These are contained in sleuth_objs.

First, we will look at the data by PCA.

plot_pca(sleuth_objs$all, color_by = "cell_type")

plot_pca(sleuth_objs$naive_npc, color_by = "rep")

plot_pca(sleuth_objs$naive_primed, color_by = "rep")

plot_pca(sleuth_objs$primed_npc, color_by = "rep")

PCA looks fine, samples separate as expected without huge batch effects except with primed to NPC. We’ll correct for replicate there but will otherwise just use a naive comparison.

First, we will compare naive to NPC, then naive to primed, then finally primed to NPC.

if (!file.exists("rdata/sleuth_analysis.RData")){
  sleuth_table_gene = list()

  sleuth_objs$naive_npc = sleuth_fit(sleuth_objs$naive_npc, ~1, 'naive')
  sleuth_objs$naive_npc = sleuth_fit(sleuth_objs$naive_npc, ~cell_type, 'cell_type')
  sleuth_objs$naive_npc = sleuth_lrt(sleuth_objs$naive_npc, 'naive', 'cell_type')
  
  sleuth_table_gene[["naive_npc"]] = sleuth_results(sleuth_objs$naive_npc, 'naive:cell_type', 'lrt', show_all = FALSE, pval_aggregate = FALSE) %>% mutate(refseq_id = str_replace(target_id, "([A-Za-z0-9_]*)\\.[0-9]*", "\\1")) 
  
  sleuth_objs$naive_primed = sleuth_fit(sleuth_objs$naive_primed, ~1, 'naive')
  sleuth_objs$naive_primed = sleuth_fit(sleuth_objs$naive_primed, ~cell_type, 'cell_type')
  sleuth_objs$naive_primed = sleuth_lrt(sleuth_objs$naive_primed, 'naive', 'cell_type')
  
  sleuth_table_gene[["naive_primed"]] = sleuth_results(sleuth_objs$naive_primed, 'naive:cell_type', 'lrt', show_all = FALSE, pval_aggregate = FALSE) %>% mutate(refseq_id = str_replace(target_id, "([A-Za-z0-9_]*)\\.[0-9]*", "\\1")) 
  
  sleuth_objs$primed_npc = sleuth_fit(sleuth_objs$primed_npc, ~1, 'naive')
  sleuth_objs$primed_npc = sleuth_fit(sleuth_objs$primed_npc, ~rep, 'rep')
  sleuth_objs$primed_npc = sleuth_fit(sleuth_objs$primed_npc, ~cell_type, 'cell_type')
  sleuth_objs$primed_npc = sleuth_fit(sleuth_objs$primed_npc, ~rep+cell_type, 'full')
  sleuth_objs$primed_npc = sleuth_lrt(sleuth_objs$primed_npc, 'naive', 'cell_type')
  sleuth_objs$primed_npc = sleuth_lrt(sleuth_objs$primed_npc, 'rep', 'full')
  
  sleuth_table_gene[["primed_npc"]] = sleuth_results(sleuth_objs$primed_npc, 'rep:full', 'lrt', show_all = FALSE, pval_aggregate = FALSE) %>% mutate(refseq_id = str_replace(target_id, "([A-Za-z0-9_]*)\\.[0-9]*", "\\1"))
  
  save.image("rdata/sleuth_analysis.RData")
} else{
  load("rdata/sleuth_analysis.RData")
}

We see that there are several thousand genes that are differentially expressed in all these conditions: 19450/51998 from naive to NPC, 17835/50800 from naive to primed, and 9908/50126 from primed to NPC.

We also want to be able to get our average gene expression for each gene.

gene_expression = sleuth_objs$all$obs_norm %>% separate(sample, c("cell_type", "rep"), sep = "_RNA_rep") %>% group_by(target_id, cell_type) %>% select(target_id, cell_type, tpm) %>% summarize(across(.fns = mean)) %>% ungroup %>% spread(key = cell_type, value = tpm)
  
gene_expression = gene_expression %>% data.frame %>% mutate(refseq_id = str_replace(target_id, "([A-Za-z0-9_]*)\\.[0-9]*", "\\1")) %>% select(target_id, refseq_id, naive = mESC_with2i, primed = mESC_no2i, npc = NPC)

We must also identify upregulated and downregulated genes

upregulated_genes = list()
downregulated_genes = list()
nonregulated_genes = list()

upregulated_genes$naive_npc = (sleuth_table_gene$naive_npc %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(npc > naive))$refseq_id
downregulated_genes$naive_npc = (sleuth_table_gene$naive_npc %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(npc < naive))$refseq_id
nonregulated_genes$naive_npc = (sleuth_table_gene$naive_npc %>% filter(qval > 0.05))$refseq_id

upregulated_genes$naive_primed = (sleuth_table_gene$naive_primed %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(primed > naive))$refseq_id
downregulated_genes$naive_primed = (sleuth_table_gene$naive_primed %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(primed < naive))$refseq_id
nonregulated_genes$naive_primed = (sleuth_table_gene$naive_primed %>% filter(qval > 0.05))$refseq_id

upregulated_genes$primed_npc = (sleuth_table_gene$primed_npc %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(npc > primed))$refseq_id
downregulated_genes$primed_npc = (sleuth_table_gene$primed_npc %>% filter(qval <= 0.05) %>% inner_join(gene_expression, by = "refseq_id") %>% filter(npc < primed))$refseq_id
nonregulated_genes$primed_npc = (sleuth_table_gene$primed_npc %>% filter(qval > 0.05))$refseq_id

Integrating RNA-seq and HMD Metagene Analysis

Having the above gene expression matrix, I want to examine the way in which gene expression changes across these different datasets from naive to NPC.

Violin plots of gene expression

First thing to do is to get the gene expression of the genes overall. We’ll be doing this a lot, so we want to create a function to do it.

violin_plot = function(gene_expression_df, filter_set = consensusGenes, title_text, legend_overlap = T, title_overlap = T)
{
  violin_df = gene_expression_df %>% filter(refseq_id %in% filter_set) %>% select(refseq_id, Naive = naive, Primed = primed, NPC = npc) %>% mutate(across(.cols = Naive:NPC, .fns = ~log10(.+.0001))) %>% gather(key = "cell_type", value = "log10_tpm", -refseq_id)
violin_df$cell_type = factor(violin_df$cell_type, levels = c('Naive', 'Primed', 'NPC'), ordered = TRUE)
  
  violin_plt = ggplot(data = violin_df, aes(x = cell_type, y = log10_tpm, fill = cell_type)) + geom_violin() +  stat_summary(fun=mean, geom="point", shape=23, size=2, fill = "black") + theme_classic() + geom_boxplot(fill = "white", width = 0.1) + scale_fill_manual(values = c("firebrick2", "royalblue2", "green3")) + theme(text = element_text(family = "sans", color = "black"), title = element_text(size = 10, color = "black"), axis.text = element_text(size = 10, color = "black"), axis.title = element_text(size = 10, color = "black"), axis.title.x = element_blank()) + scale_y_continuous(name = "Log(TPM + 1e-4)", breaks = seq(-4, 6, 2))
  
  if (!legend_overlap){ violin_plt = violin_plt + theme(legend.position = 'none') }
  
  if (title_overlap){ violin_plt = violin_plt + labs(title = title_text) }
  
  print(violin_plt)
}
violin_plot(gene_expression, title_text = "Gene expression at all genes in naive mESCs")

Next, we plot gene expression of bivalent genes in naive cells.

violin_plot(gene_expression, filter_set = bivalent_genes_25$naive, title_text = "Gene expression at genes with >25% bivalency in naive mESCs")

We see only subtle changes at best. Let us now section by K27 or K4 dominance.

violin_plot(gene_expression, filter_set = naive_dominance$k27, title_text = "Gene expression at K27 dominant bivalent genes in naive mESCs")

violin_plot(gene_expression, filter_set = (hmds$naive %>% filter(logratio > 1 & trans < 25))$Gene, title_text = "Gene expression at K27 dominant NON-bivalent genes in naive mESCs")

violin_plot(gene_expression, filter_set = naive_dominance$k4, title_text = "Gene expression at K4 dominant bivalent genes in naive mESCs")

violin_plot(gene_expression, filter_set = (hmds$naive %>% filter(logratio < -1 & trans < 25))$Gene, title_text = "Gene expression at K4 dominant NON-bivalent genes in naive mESCs")

violin_plot(gene_expression, filter_set = naive_dominance$mid, title_text = "Gene expression at mid dominant bivalent genes in naive mESCs")

violin_plot(gene_expression, filter_set = (hmds$naive %>% filter(logratio < 1 & logratio > -1 & trans < 25))$Gene, title_text = "Gene expression at mid dominant NON-bivalent genes in naive mESCs")

This is actually somewhat interesting. Dividing based on non-bivalent vs. bivalent genes, we see that K27 dominant bivalent genes actually do increase in expression over differentiation, but K27 dominant non-bivalent genes do not.

The other thing that we can do here is to bin the naive mESC bivalent genes by gene expression, then track those subsets of genes across the differentiation pathway.

for (i in 1:5)
{
  gene_expression_bivalent = gene_expression %>% filter(refseq_id %in% bivalent_genes_25$naive)
  countup = floor(length(gene_expression_bivalent$naive)/5)
  gene_list_set = (gene_expression_bivalent %>% arrange(desc(naive)) %>%  slice((countup*(i-1)):(countup*i - 1)))$refseq_id
  
  violin_plot(gene_expression_bivalent, gene_list_set, paste0("Gene expression at bivalent genes in naive mESCs, naive expression quintile ", i))
}

rm(gene_expression_bivalent, countup, gene_list_set)

We will section more coarsely to cut off at above and below -3 for log10_tpm.

gene_expression_bivalent = gene_expression %>% filter(refseq_id %in% bivalent_genes_25$naive)
gene_list_set_highexp = (gene_expression_bivalent %>% select(refseq_id:npc) %>% mutate(across(.cols = naive:npc, .fns = ~log10(.+.0001))) %>% filter(naive>=-3))$refseq_id

violin_plot(gene_expression_bivalent, filter_set = gene_list_set_highexp, title_text = "Gene expression at bivalent genes in naive, log(TPM) > -3")

gene_list_set_lowexp = (gene_expression_bivalent %>% select(refseq_id:npc) %>% mutate(across(.cols = naive:npc, .fns = ~log10(.+.0001))) %>% filter(naive < -3))$refseq_id

violin_plot(gene_expression_bivalent, filter_set = gene_list_set_lowexp, title_text = "Gene expression at bivalent genes in naive, log(TPM) < -3")

rm(gene_expression_bivalent, gene_list_set_lowexp, gene_list_set_highexp)

I don’t really know what to say, it looks basically the same there.

To check if the quintiles spread is real effect or just due to chance, I will do the same thing, but working off quintiles for npc instead.

for (i in 1:5)
{
  gene_expression_bivalent = gene_expression %>% filter(refseq_id %in% bivalent_genes_25$naive)
  countup = floor(length(gene_expression_bivalent$naive)/5)
  gene_list_set = (gene_expression_bivalent %>% arrange(desc(npc)) %>%  slice((countup*(i-1)):(countup*i - 1)))$refseq_id
  
  violin_plot(gene_expression_bivalent, gene_list_set, paste0("Gene expression at bivalent genes in naive mESCs, NPC expression quintile ", i))
}

rm(gene_expression_bivalent, countup)

Okay it looks very similar, I will take that to mean that it’s just chance.

Metaprofiles by gene expression

The other obvious thing is to look at upregulated genes, downregulated genes, and insignificantly changed genes to look at bivalency metaprofiles there.

metagene_plot(metagene(naive, tss_range, upregulated_genes$naive_npc), title = paste0("Naive cells at genes upregulated from naive to NPC (", length(upregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(primed, tss_range, upregulated_genes$naive_npc), title = paste0("Primed cells at genes upregulated from naive to NPC (", length(upregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(npc, tss_range, upregulated_genes$naive_npc), title = paste0("NPC cells at genes upregulated from naive to NPC (", length(upregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(naive, tss_range, downregulated_genes$naive_npc), title = paste0("Naive cells at genes downregulated from naive to NPC (", length(downregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(primed, tss_range, downregulated_genes$naive_npc), title = paste0("Primed cells at genes downregulated from naive to NPC (", length(downregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(npc, tss_range, downregulated_genes$naive_npc), title = paste0("NPC cells at genes downregulated from naive to NPC (", length(downregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(naive, tss_range, nonregulated_genes$naive_npc), title = paste0("Naive cells at genes nonregulated from naive to NPC (", length(nonregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(primed, tss_range, nonregulated_genes$naive_npc), title = paste0("Primed cells at genes nonregulated from naive to NPC (", length(nonregulated_genes$naive_npc), " genes)"))

metagene_plot(metagene(npc, tss_range, nonregulated_genes$naive_npc), title = paste0("NPC cells at genes nonregulated from naive to NPC (", length(nonregulated_genes$naive_npc), " genes)"))

Minor differences in magnitude aside, there is basically no difference between the bivalency profiles of upregulated and downregulated genes.

Given the above results, however, we have to do some overlap analysis between dominance classes and the differential expression classes.

commonGenes = sleuth_table_gene$naive_npc$refseq_id[sleuth_table_gene$naive_npc$refseq_id %in% consensusGenes]

print(paste0("Proportion of bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% bivalent_genes_25$naive %>% sum)/(bivalent_genes_25$naive %in% commonGenes %>% sum)))
## [1] "Proportion of bivalent genes that are DE: 54.7436415018167"
print(paste0("Number of bivalent genes that are DE: ", (bivalent_genes_25$naive %in% commonGenes %>% sum)- (nonregulated_genes$naive_npc %in% bivalent_genes_25$naive %>% sum)))
## [1] "Number of bivalent genes that are DE: 5424"
print(paste0("Proportion of K27 dom bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% naive_dominance$k27 %>% sum)/(naive_dominance$k27 %in% commonGenes %>% sum)))
## [1] "Proportion of K27 dom bivalent genes that are DE: 60.2026049204052"
print(paste0("Number of K27 dom bivalent genes that are DE: ", (naive_dominance$k27 %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% naive_dominance$k27 %>% sum)))
## [1] "Number of K27 dom bivalent genes that are DE: 416"
print(paste0("Proportion of K4 dom bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% naive_dominance$k4 %>% sum)/(naive_dominance$k4 %in% commonGenes %>% sum)))
## [1] "Proportion of K4 dom bivalent genes that are DE: 56.6818073153239"
print(paste0("Number of K4 dom bivalent genes that are DE: ", (naive_dominance$k4 %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% naive_dominance$k4 %>% sum)))
## [1] "Number of K4 dom bivalent genes that are DE: 2371"
print(paste0("Proportion of mid dom bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% naive_dominance$mid %>% sum)/(naive_dominance$mid %in% commonGenes %>% sum)))
## [1] "Proportion of mid dom bivalent genes that are DE: 52.3837902264601"
print(paste0("Number of mid dom bivalent genes that are DE: ", (naive_dominance$mid %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% naive_dominance$mid %>% sum)))
## [1] "Number of mid dom bivalent genes that are DE: 2637"
non_biv = setdiff(commonGenes, bivalent_genes_25$naive)
print(paste0("Proportion of non-bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% non_biv %>% sum)/(non_biv %in% commonGenes %>% sum)))
## [1] "Proportion of non-bivalent genes that are DE: 47.9497404258521"
print(paste0("Number of non-bivalent genes that are DE: ", (non_biv %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% non_biv %>% sum)))
## [1] "Number of non-bivalent genes that are DE: 6373"
non_biv_k27 = (hmds$naive %>% filter(Gene %in% non_biv) %>% filter(logratio > 1))$Gene
print(paste0("Proportion of K27 dom non-bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% non_biv_k27 %>% sum)/(non_biv_k27 %in% commonGenes %>% sum)))
## [1] "Proportion of K27 dom non-bivalent genes that are DE: 37.3538011695906"
print(paste0("Number of K27 dom non-bivalent genes that are DE: ", (non_biv_k27 %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% non_biv_k27 %>% sum)))
## [1] "Number of K27 dom non-bivalent genes that are DE: 1022"
non_biv_k4 = (hmds$naive %>% filter(Gene %in% non_biv) %>% filter(logratio < -1))$Gene
print(paste0("Proportion of K4 dom non-bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% non_biv_k4 %>% sum)/(non_biv_k4 %in% commonGenes %>% sum)))
## [1] "Proportion of K4 dom non-bivalent genes that are DE: 54.4126416219439"
print(paste0("Number of K4 dom non-bivalent genes that are DE: ", (non_biv_k4 %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% non_biv_k4 %>% sum)))
## [1] "Number of K4 dom non-bivalent genes that are DE: 3650"
non_biv_mid = (hmds$naive %>% filter(Gene %in% non_biv) %>% filter(logratio < 1 & logratio > -1))$Gene
print(paste0("Proportion of mid dom non-bivalent genes that are DE: ", 100-100*(nonregulated_genes$naive_npc %in% non_biv_mid %>% sum)/(non_biv_mid %in% commonGenes %>% sum)))
## [1] "Proportion of mid dom non-bivalent genes that are DE: 44.2162724200676"
print(paste0("Proportion of mid dom non-bivalent genes that are DE: ", (non_biv_mid %in% commonGenes %>% sum) - (nonregulated_genes$naive_npc %in% non_biv_mid %>% sum)))
## [1] "Proportion of mid dom non-bivalent genes that are DE: 1701"

Gene ontology and bivalency classes

We can now move on to running gene ontology on our different bivalency classes. We will run these by Ensembl gene IDs. First order of business is to set up our analysis by preparing the data.

if (!file.exists("go_struct.RData")){
  ensembl_dominance = list()
  ensembl_dominance$k27 = (refseq_ensembl_external_go %>% filter(refseq_id %in% naive_dominance$k27) %>% filter(ensembl_gene_id %in% ensembl_with_go))$ensembl_gene_id %>% unique
  ensembl_dominance$k4 = (refseq_ensembl_external_go %>% filter(refseq_id %in% naive_dominance$k4) %>% filter(ensembl_gene_id %in% ensembl_with_go))$ensembl_gene_id %>% unique
  ensembl_dominance$mid = (refseq_ensembl_external_go %>% filter(refseq_id %in% naive_dominance$mid) %>% filter(ensembl_gene_id %in% ensembl_with_go))$ensembl_gene_id %>% unique
  
  gene_2_go = unstack((refseq_ensembl_external_go %>% filter(ensembl_gene_id %in% ensembl_with_go))[,c(4,2)])
  
  genelists_ontology = list()
  genelists_ontology$k27 = factor(as.integer(ensembl_with_go %in% ensembl_dominance$k27))
  names(genelists_ontology$k27) = ensembl_with_go
  genelists_ontology$k4 = factor(as.integer(ensembl_with_go %in% ensembl_dominance$k4))
  names(genelists_ontology$k4) = ensembl_with_go
  genelists_ontology$mid = factor(as.integer(ensembl_with_go %in% ensembl_dominance$mid))
  names(genelists_ontology$mid) = ensembl_with_go
  
  godata = list()
  godata$k27 = new('topGOdata', ontology = 'BP', allGenes = genelists_ontology$k27, annot = annFUN.gene2GO, gene2GO = gene_2_go)
  godata$k4 = new('topGOdata', ontology = 'BP', allGenes = genelists_ontology$k4, annot = annFUN.gene2GO, gene2GO = gene_2_go)
  godata$mid = new('topGOdata', ontology = 'BP', allGenes = genelists_ontology$mid, annot = annFUN.gene2GO, gene2GO = gene_2_go)
  save.image("go_struct.RData")
} else {
  load("go_struct.RData")
}

Having set that up, we will now define a function so that we can run gene ontology analyses and test for significance more efficiently.

testGO = function(GOdata, algo = "classic"){
  fisher_result = runTest(GOdata, algorithm = algo, statistic = "fisher")
  allGO = usedGO(GOdata)
  all_res = GenTable(GOdata, Fisher = fisher_result, orderBy = 'Fisher', topNodes = length(allGO))
  p.adj=round(p.adjust(all_res$Fisher,method="BH"),digits = 4)
  p.adj[is.na(p.adj)] = 0
  all_res_final=cbind(all_res,p.adj)
  all_res_final=all_res_final[order(all_res_final$p.adj),]
  return(all_res_final)
}

We valswill now conduct GO analysis on the datasets described above.

go_classic_tables = list()
go_classic_tables$k27 = testGO(godata$k27)
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 7588 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## Warning in p.adjust(all_res$Fisher, method = "BH"): NAs introduced by coercion
go_classic_tables$k4 = testGO(godata$k4)
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 9892 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## Warning in p.adjust(all_res$Fisher, method = "BH"): NAs introduced by coercion
go_classic_tables$mid = testGO(godata$mid)
## 
##           -- Classic Algorithm -- 
## 
##       the algorithm is scoring 11715 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## Warning in p.adjust(all_res$Fisher, method = "BH"): NAs introduced by coercion

I don’t know what to do here – I guess pull out some of the terms that we looked at before?

go_term_pvals = data.frame(GO.ID = c("GO:0032502", "GO:0009790", "GO:0019827", "GO:0030182", "GO:0008152", "GO:0002376"))
go_term_pvals = go_term_pvals %>% inner_join(go_classic_tables$k27, by = "GO.ID") %>% select(GO.ID, Term, k27 = p.adj)
go_term_pvals = go_term_pvals %>% inner_join(go_classic_tables$k4, by = "GO.ID") %>% select(GO.ID, Term = Term.x, k27, k4 = p.adj)
go_term_pvals = go_term_pvals %>% inner_join(go_classic_tables$mid, by = "GO.ID") %>% select(GO.ID, Term = Term.x, k27, k4, mid = p.adj)

print(go_term_pvals)
##        GO.ID                             Term    k27     k4    mid
## 1 GO:0032502            developmental process 0.0000 1.0000 0.0000
## 2 GO:0009790               embryo development 0.0000 0.1520 0.0000
## 3 GO:0019827 stem cell population maintenance 0.0507 0.0019 0.5518
## 4 GO:0030182           neuron differentiation 0.0000 1.0000 0.0000
## 5 GO:0008152                metabolic process 1.0000 0.0000 0.0000
## 6 GO:0002376            immune system process 0.0462 1.0000 1.0000

Classification of genes

This is essentially asking the final question about whether bivalency is actually an example of the “histone code.” We will create a classification variable to identify whether a gene is developmentally regulated.

First we will look at naive to npc, then look at primed to npc.

Naive to NPC

hmd_diffexp_naive_npc = hmds$naive %>% inner_join(sleuth_table_gene$naive_npc, by = c("Gene" = "refseq_id")) %>% select(Gene:k27, trans, qval) %>% mutate(diffexp = ifelse(qval <= 0.05, 1, 0)) %>% mutate(diffexp = as.factor(diffexp)) %>% mutate(logk4 = log(k4+1), logk27 = log(k27+1), logtrans = log(ifelse(trans+1>1, trans+1, 1)), logk9 = log(k9+1)) %>% select(diffexp, k4, logk4, k27, logk27, k9, logk9, trans, logtrans)

naive_npc_BICs = numeric()

# Trivial Model
naive_npc_BICs["trivial"] = BIC(glm(diffexp ~ 1, data = hmd_diffexp_naive_npc, family = binomial))

# K4 only
naive_npc_BICs["k4"] = BIC(glm(diffexp ~ k4, data = hmd_diffexp_naive_npc, family = binomial))

# K27 only
naive_npc_BICs["k27"] = BIC(glm(diffexp ~ k27, data = hmd_diffexp_naive_npc, family = binomial))

# K9 only
naive_npc_BICs["k9"] = BIC(glm(diffexp ~ k9, data = hmd_diffexp_naive_npc, family = binomial))

# trans only
naive_npc_BICs["trans"] = BIC(glm(diffexp ~ trans, data = hmd_diffexp_naive_npc, family = binomial))

# K4 + K27
naive_npc_BICs["k4_k27"] = BIC(glm(diffexp ~ k4 + k27, data = hmd_diffexp_naive_npc, family = binomial))

# K4 + K27 + trans
naive_npc_BICs["k4_k27_trans"] = BIC(glm(diffexp ~ k4 + k27 + trans, data = hmd_diffexp_naive_npc, family = binomial))

# K4 + K27 + log of each
naive_npc_BICs["k4_k27_logs"] = BIC(glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_naive_npc, family = binomial))

# K4 + K27 + log of each + bivalency
naive_npc_BICs["k4_k27_trans_logs"] = BIC(glm(diffexp ~ k4 + k27 + trans + logk4 + logk27 + logtrans, data = hmd_diffexp_naive_npc, family = binomial))

print(naive_npc_BICs[order(naive_npc_BICs)])
##       k4_k27_logs k4_k27_trans_logs            k4_k27      k4_k27_trans 
##          31681.10          31694.95          31755.64          31765.15 
##                k4             trans           trivial               k27 
##          31799.66          32049.27          32182.03          32190.74 
##                k9 
##          32191.98

On the whole, we find that the model with K4 and K27 with logs of both minimizes BIC by approximately 14.

Full summary of top two models:

glm_k4_k27_logs_naive_npc = glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_naive_npc, family = binomial)

glm_k4_k27_logs_naive_npc_trans = glm(diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, data = hmd_diffexp_naive_npc, family = binomial)

# K4, K27, log K4, log K27
summary(glm_k4_k27_logs_naive_npc)
## 
## Call:
## glm(formula = diffexp ~ k4 + k27 + logk4 + logk27, family = binomial, 
##     data = hmd_diffexp_naive_npc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2189  -1.1960   0.9187   1.1283   1.6012  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.332649   0.102540  -3.244  0.00118 ** 
## k4           0.006998   0.001414   4.950 7.43e-07 ***
## k27          0.016506   0.001491  11.069  < 2e-16 ***
## logk4        0.178120   0.036182   4.923 8.53e-07 ***
## logk27      -0.313040   0.035657  -8.779  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32172  on 23211  degrees of freedom
## Residual deviance: 31631  on 23207  degrees of freedom
## AIC: 31641
## 
## Number of Fisher Scoring iterations: 4
# K4, K27, log K4, log K27, trans
summary(glm_k4_k27_logs_naive_npc_trans)
## 
## Call:
## glm(formula = diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, 
##     family = binomial, data = hmd_diffexp_naive_npc)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.159  -1.195   0.921   1.129   1.602  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.322804   0.107918  -2.991  0.00278 ** 
## k4           0.008392   0.001521   5.518 3.44e-08 ***
## k27          0.016661   0.001559  10.684  < 2e-16 ***
## logk4        0.103848   0.048886   2.124  0.03365 *  
## logk27      -0.320080   0.035806  -8.939  < 2e-16 ***
## trans       -0.002943   0.001436  -2.050  0.04041 *  
## logtrans     0.095624   0.038357   2.493  0.01267 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32172  on 23211  degrees of freedom
## Residual deviance: 31625  on 23205  degrees of freedom
## AIC: 31639
## 
## Number of Fisher Scoring iterations: 4

Of course, this should be done better. We will use training and testing data.

set.seed(1209)

sample = sample.int(n = nrow(hmd_diffexp_naive_npc), size = floor(.7*nrow(hmd_diffexp_naive_npc)), replace = F)

hmd_diffexp_naive_npc_train = hmd_diffexp_naive_npc[sample,]
hmd_diffexp_naive_npc_test = hmd_diffexp_naive_npc[-sample,]

glm_k4_k27_logs_naive_npc_train = glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_naive_npc_train, family = binomial)
glm_k4_k27_logs_naive_npc_trans_train = glm(diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, data = hmd_diffexp_naive_npc_train, family = binomial)
glm_k4_k27_logs_naive_npc_k9_train = glm(diffexp ~ k4 + k27 + logk4 + logk27 + k9, data = hmd_diffexp_naive_npc_train, family = binomial)

hmd_diffexp_naive_npc_test$predicted = predict(glm_k4_k27_logs_naive_npc_train, newdata = hmd_diffexp_naive_npc_test, type = "response")
hmd_diffexp_naive_npc_test$predicted_trans = predict(glm_k4_k27_logs_naive_npc_trans_train, newdata = hmd_diffexp_naive_npc_test, type = "response")
hmd_diffexp_naive_npc_test$predicted_k9 = predict(glm_k4_k27_logs_naive_npc_k9_train, newdata = hmd_diffexp_naive_npc_test, type = "response")

plot(x = hmd_diffexp_naive_npc_test$predicted, y = hmd_diffexp_naive_npc_test$diffexp)

plot(x = hmd_diffexp_naive_npc_test$predicted_trans, y = hmd_diffexp_naive_npc_test$diffexp)

plot(x = hmd_diffexp_naive_npc_test$predicted_k9, y = hmd_diffexp_naive_npc_test$diffexp)

hmd_diffexp_naive_npc_test = hmd_diffexp_naive_npc_test %>% mutate(predicted_dir = ifelse(predicted>0.5, 1, 0), predicted_dir_trans = ifelse(predicted_trans>0.5, 1, 0), predicted_dir_k9 = ifelse(predicted_k9>0.5, 1, 0))

mean(1==hmd_diffexp_naive_npc_test$diffexp)
## [1] 0.5133544
mean(hmd_diffexp_naive_npc_test$predicted_dir==hmd_diffexp_naive_npc_test$diffexp)
## [1] 0.5628949
mean(hmd_diffexp_naive_npc_test$predicted_dir_trans==hmd_diffexp_naive_npc_test$diffexp)
## [1] 0.5630385
mean(hmd_diffexp_naive_npc_test$predicted_dir_k9==hmd_diffexp_naive_npc_test$diffexp)
## [1] 0.5640437

By the trivial model where we assume everything is differentially expressed, we find that we have an accuracy rate of 51.3%. By adding in a GLM for K4, K27, log K4, and log K27, we increase accuracy rate to 56.3%. By added in trans bivalency (and log therein) to the GLM on top of that, we only increase to accuracy rate to 56.4%, and to an even lesser degree than if we instead added in K9.

Primed to NPC

hmd_diffexp_primed_npc = hmds$primed %>% inner_join(sleuth_table_gene$primed_npc, by = c("Gene" = "refseq_id")) %>% select(Gene:k27, trans, qval) %>% mutate(diffexp = ifelse(qval <= 0.05, 1, 0)) %>% mutate(diffexp = as.factor(diffexp)) %>% mutate(logk4 = log(k4+1), logk27 = log(k27+1), logtrans = log(ifelse(trans+1>1, trans+1, 1))) %>% select(diffexp, k4, logk4, k27, logk27, k9, trans, logtrans)

primed_npc_BICs = numeric()

# Trivial Model
primed_npc_BICs["trivial"] = BIC(glm(diffexp ~ 1, data = hmd_diffexp_primed_npc, family = binomial))

# K4 only
primed_npc_BICs["k4"] = BIC(glm(diffexp ~ k4, data = hmd_diffexp_primed_npc, family = binomial))

# K27 only
primed_npc_BICs["k27"] = BIC(glm(diffexp ~ k27, data = hmd_diffexp_primed_npc, family = binomial))

# K9 only
primed_npc_BICs["k9"] = BIC(glm(diffexp ~ k9, data = hmd_diffexp_primed_npc, family = binomial))

# trans only
primed_npc_BICs["trans"] = BIC(glm(diffexp ~ trans, data = hmd_diffexp_primed_npc, family = binomial))

# K4 + K27
primed_npc_BICs["k4_k27"] = BIC(glm(diffexp ~ k4 + k27, data = hmd_diffexp_primed_npc, family = binomial))

# K4 + K27 + trans
primed_npc_BICs["k4_k27_trans"] = BIC(glm(diffexp ~ k4 + k27 + trans, data = hmd_diffexp_primed_npc, family = binomial))

# K4 + K27 + log of each
primed_npc_BICs["k4_k27_logs"] = BIC(glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_primed_npc, family = binomial))

# K4 + K27 + log of each + bivalency
primed_npc_BICs["k4_k27_trans_logs"] = BIC(glm(diffexp ~ k4 + k27 + trans + logk4 + logk27 + logtrans, data = hmd_diffexp_primed_npc, family = binomial))

print(primed_npc_BICs[order(primed_npc_BICs)])
##       k4_k27_logs k4_k27_trans_logs            k4_k27      k4_k27_trans 
##          28690.06          28698.28          28743.02          28749.58 
##                k4               k27           trivial             trans 
##          28812.13          29019.73          29060.79          29061.58 
##                k9 
##          29070.02

Here, we find that the model with logs of k4 and k27 is better than that with logs and trans by just a hair.

Full summary of top two models:

glm_k4_k27_logs_primed_npc = glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_primed_npc, family = binomial)

glm_k4_k27_logs_primed_npc_trans = glm(diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, data = hmd_diffexp_primed_npc, family = binomial)

# K4, K27, log K4, log K27
summary(glm_k4_k27_logs_primed_npc)
## 
## Call:
## glm(formula = diffexp ~ k4 + k27 + logk4 + logk27, family = binomial, 
##     data = hmd_diffexp_primed_npc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3663  -0.9210  -0.8129   1.3988   2.1072  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.8656581  0.1432863  -6.041 1.53e-09 ***
## k4           0.0022042  0.0018743   1.176    0.240    
## k27          0.0003037  0.0008578   0.354    0.723    
## logk4        0.3330608  0.0425892   7.820 5.27e-15 ***
## logk27      -0.2551358  0.0475445  -5.366 8.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29051  on 23072  degrees of freedom
## Residual deviance: 28640  on 23068  degrees of freedom
## AIC: 28650
## 
## Number of Fisher Scoring iterations: 4
# K4, K27, log K4, log K27, trans
summary(glm_k4_k27_logs_primed_npc_trans)
## 
## Call:
## glm(formula = diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, 
##     family = binomial, data = hmd_diffexp_primed_npc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4021  -0.9205  -0.8099   1.3969   2.1241  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.8568215  0.1432869  -5.980 2.23e-09 ***
## k4           0.0016736  0.0018915   0.885    0.376    
## k27          0.0009991  0.0009796   1.020    0.308    
## logk4        0.3581254  0.0432885   8.273  < 2e-16 ***
## logk27      -0.2450827  0.0505636  -4.847 1.25e-06 ***
## trans       -0.0005846  0.0005923  -0.987    0.324    
## logtrans    -0.0325684  0.0233314  -1.396    0.163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29051  on 23072  degrees of freedom
## Residual deviance: 28628  on 23066  degrees of freedom
## AIC: 28642
## 
## Number of Fisher Scoring iterations: 4

Of course, this should be done better. We will use training and testing data.

set.seed(1209)

sample = sample.int(n = nrow(hmd_diffexp_primed_npc), size = floor(.7*nrow(hmd_diffexp_primed_npc)), replace = F)

hmd_diffexp_primed_npc_train = hmd_diffexp_primed_npc[sample,]
hmd_diffexp_primed_npc_test = hmd_diffexp_primed_npc[-sample,]

glm_k4_k27_logs_primed_npc_train = glm(diffexp ~ k4 + k27 + logk4 + logk27, data = hmd_diffexp_primed_npc_train, family = binomial)
glm_k4_k27_logs_primed_npc_trans_train = glm(diffexp ~ k4 + k27 + logk4 + logk27 + trans + logtrans, data = hmd_diffexp_primed_npc_train, family = binomial)
glm_k4_k27_logs_primed_npc_k9_train = glm(diffexp ~ k4 + k27 + logk4 + logk27 + k9, data = hmd_diffexp_primed_npc_train, family = binomial)

hmd_diffexp_primed_npc_test$predicted = predict(glm_k4_k27_logs_primed_npc_train, newdata = hmd_diffexp_primed_npc_test, type = "response")
hmd_diffexp_primed_npc_test$predicted_trans = predict(glm_k4_k27_logs_primed_npc_trans_train, newdata = hmd_diffexp_primed_npc_test, type = "response")
hmd_diffexp_primed_npc_test$predicted_k9 = predict(glm_k4_k27_logs_primed_npc_k9_train, newdata = hmd_diffexp_primed_npc_test, type = "response")

plot(x = hmd_diffexp_primed_npc_test$predicted, y = hmd_diffexp_primed_npc_test$diffexp)

plot(x = hmd_diffexp_primed_npc_test$predicted_trans, y = hmd_diffexp_primed_npc_test$diffexp)

plot(x = hmd_diffexp_primed_npc_test$predicted_k9, y = hmd_diffexp_primed_npc_test$diffexp)

hmd_diffexp_primed_npc_test = hmd_diffexp_primed_npc_test %>% mutate(predicted_dir = ifelse(predicted>0.5, 1, 0), predicted_dir_trans = ifelse(predicted_trans>0.5, 1, 0), predicted_dir_k9 = ifelse(predicted_k9>0.5, 1, 0))

mean(0==hmd_diffexp_primed_npc_test$diffexp)
## [1] 0.668015
mean(hmd_diffexp_primed_npc_test$predicted_dir==hmd_diffexp_primed_npc_test$diffexp)
## [1] 0.6672927
mean(hmd_diffexp_primed_npc_test$predicted_dir_trans==hmd_diffexp_primed_npc_test$diffexp)
## [1] 0.6674372
mean(hmd_diffexp_primed_npc_test$predicted_dir_k9==hmd_diffexp_primed_npc_test$diffexp)
## [1] 0.6668593

By the trivial model where we assume nothing is differentially expressed, we have a 66.8% accuracy rate. This rate is 66.73% with the model without trans, and 66.76% with the model with trans. Adding in k9 instead of trans and logtrans here brings the accuracy rate down to 66.69%.

Conclusion: bivalency does not provide us with more information about developmental staging of genes than do K4 and K27 alone.

Fisher Hypergeometric Tests for Overlap

In this section, we will look for overlap with existing datasets.

First, we will look at the Mikkelsen, Xiao, and Mas bivalent genes as defined by ouranalysis et al., with overlap from -2.5kb to the end of the gene.

blanco = list()
blanco$mikkelsen = read.table("full_genes/Bivalent_refseq_Mikkelsen.bed", stringsAsFactors = F, header = F)$V4
blanco$mas = read.table("full_genes/Bivalent_refseq_Mas.bed", stringsAsFactors = F, header = F)$V4
blanco$xiao = read.table("full_genes/Bivalent_refseq_Xiao.bed", stringsAsFactors = F, header = F)$V4

blanco_fisher = list()

for (i in names(blanco))
{
  in_both = bivalent_genes_25$naive %in% blanco[[i]] %>% sum
  only_naive = setdiff(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes], blanco[[i]]) %>% length
  only_blanco = setdiff(blanco[[i]], bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes]) %>% length
  in_neither = length(consensusGenes) - only_naive - only_blanco + in_both
  print(i)
  print(matrix(c(in_both, only_naive, only_blanco, in_neither), nr = 2))
  blanco_fisher[[i]] = fisher.test(matrix(c(in_both, only_naive, only_blanco, in_neither), nr = 2), alternative = "greater")
  print(phyper(in_both, length(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes]), length(setdiff(consensusGenes, bivalent_genes_25$naive)), length(blanco[[i]]), lower.tail = FALSE))
  
  metagene_plot(metagene(naive, tss_range, blanco[[i]]), title = paste0("Naive cells at bivalent genes from blanco analysis of ", i))
  metagene_plot(metagene(primed, tss_range, blanco[[i]]), title = paste0("Primed cells at bivalent genes from blanco analysis of ", i))
  metagene_plot(metagene(npc, tss_range, blanco[[i]]), title = paste0("NPC cells at bivalent genes from blanco analysis of ", i))
  
  naive_not_i = setdiff(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes], blanco[[i]])
  metagene_plot(metagene(naive, tss_range, naive_not_i), title = paste0("Naive cells at naive bivalent genes NOT from blanco analysis of ", i))
  metagene_plot(metagene(primed, tss_range, naive_not_i), title = paste0("Primed cells at naive bivalent genes NOT from blanco analysis of ", i))
  metagene_plot(metagene(npc, tss_range, naive_not_i), title = paste0("NPC cells at naive bivalent genes NOT from blanco analysis of ", i))
  
  rm(in_both, in_neither, only_blanco, only_naive, naive_not_i)
}
## [1] "mikkelsen"
##       [,1]  [,2]
## [1,]  1390   484
## [2,] 12474 31054
## [1] 7.898247e-314

## [1] "mas"
##      [,1]  [,2]
## [1,] 6852  4222
## [2,] 7012 38240
## [1] 0

## [1] "xiao"
##       [,1]  [,2]
## [1,]  3674  1382
## [2,] 10190 34724
## [1] 0

Overwhelmingly, it seems clear that everyone else only looked at the most strongly H3K27me3-dominant genes by the ouranalysis reanalysis. Now we will look using our metric, where we only consider it if there is both H3K4me3 and H3K27me3 peak overlapping the +1/+2 nucleosomes.

ouranalysis = list()
ouranalysis$mikkelsen = read.table("refseq_promoter/Bivalent_refseq_promoter_Mikkelsen.tab", stringsAsFactors = F, header = F)$V4
ouranalysis$mas = read.table("refseq_promoter/Bivalent_refseq_promoter_Mas.tab", stringsAsFactors = F, header = F)$V4
ouranalysis$xiao = read.table("refseq_promoter/Bivalent_refseq_promoter_Xiao.tab", stringsAsFactors = F, header = F)$V4

ouranalysis_fisher = list()

for (i in names(ouranalysis))
{
  in_both = bivalent_genes_25$naive %in% ouranalysis[[i]] %>% sum
  only_naive = setdiff(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes], ouranalysis[[i]]) %>% length
  only_ouranalysis = setdiff(ouranalysis[[i]], bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes]) %>% length
  in_neither = length(consensusGenes) - only_naive - only_ouranalysis + in_both
  print(i)
  print(matrix(c(in_both, only_naive, only_ouranalysis, in_neither), nr = 2))
  ouranalysis_fisher[[i]] = fisher.test(matrix(c(in_both, only_naive, only_ouranalysis, in_neither), nr = 2), alternative = "greater")
  print(phyper(in_both, length(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes]), length(setdiff(consensusGenes, bivalent_genes_25$naive)), length(ouranalysis[[i]]), lower.tail = FALSE))
  
  metagene_plot(metagene(naive, tss_range, ouranalysis[[i]]), title = paste0("Naive cells at bivalent genes from our analysis of ", i))
  metagene_plot(metagene(primed, tss_range, ouranalysis[[i]]), title = paste0("Primed cells at bivalent genes from our analysis of ", i))
  metagene_plot(metagene(npc, tss_range, ouranalysis[[i]]), title = paste0("NPC cells at bivalent genes from our analysis of ", i))
  
  naive_not_i = setdiff(bivalent_genes_25$naive[bivalent_genes_25$naive %in% consensusGenes], ouranalysis[[i]])
  metagene_plot(metagene(naive, tss_range, naive_not_i), title = paste0("Naive cells at naive bivalent genes NOT from our analysis of ", i))
  metagene_plot(metagene(primed, tss_range, naive_not_i), title = paste0("Primed cells at naive bivalent genes NOT from our analysis of ", i))
  metagene_plot(metagene(npc, tss_range, naive_not_i), title = paste0("NPC cells at naive bivalent genes NOT from our analysis of ", i))
  
  rm(in_both, in_neither, only_ouranalysis, only_naive, naive_not_i)
}
## [1] "mikkelsen"
##       [,1]  [,2]
## [1,]  1136   199
## [2,] 12728 30831
## [1] 0

## [1] "mas"
##      [,1]  [,2]
## [1,] 4928  1942
## [2,] 8936 36672
## [1] 0

## [1] "xiao"
##       [,1]  [,2]
## [1,]  2914   732
## [2,] 10950 33854
## [1] 0

Again, they’re just looking at the H3K27me3-dominated class of genes.

Receiver Operator Characteristic Curves

In this section, we will use our diffexp data frames to create receiver operator characteristic curves. As usual, we will first create functions; these will output the area under the curve.

library(wesanderson)
## Warning: package 'wesanderson' was built under R version 4.0.5
pal = wes_palette("Zissou1", type = "continuous")

simple_roc = function(labels, hmds, title_text = NA, rev = F, legend_overlap = T, title_overlap = T)
{
  df = data.frame(labs = labels, hmd = hmds)
  roc_df = df %>% mutate(hmd = ifelse(hmd<0, 0, hmd)) %>% mutate(hmd = ifelse(hmd>100, 100, hmd)) %>% group_by(hmd) %>% summarize(tp = sum(labs == 1-rev), fp= sum(labs == 0+rev)) %>% ungroup %>% arrange(desc(hmd)) %>% mutate(tps = cumsum(tp), fps = cumsum(fp))
  roc_df$tpr = roc_df$tps/sum(roc_df$tp)
  roc_df$fpr = roc_df$fps/sum(roc_df$fp)
  
  roc_plt = ggplot(data = roc_df, aes(x = fpr, y = tpr, color = hmd)) + geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), size = 0.5, color = "black") + geom_line(size=1) + scale_color_gradientn(colors = pal, limits = c(0, 100)) + theme_classic() + scale_x_continuous(name = "False Positive Rate", breaks = seq(0, 1, 0.25)) + scale_y_continuous(name = "True Positive Rate", breaks = seq(0, 1, 0.25)) + theme(text = element_text(size = 10, color = "black"), axis.text = element_text(size = 10, color = "black"), axis.title = element_text(size = 10, color = "black"), title = element_text(size = 10, color = "black"))
  
  if (!legend_overlap){ roc_plt = roc_plt + theme(legend.position = 'none') }
  if (!is.na(title_text) & title_overlap){ roc_plt = roc_plt + ggtitle(title_text) }
  print(roc_plt)
  
  auc_df = roc_df %>% tail(-1)
  auc_df$fpr_diff = auc_df$fpr - head(roc_df, -1)$fpr
  auc_df$area_diff = auc_df$fpr_diff*auc_df$tpr
  
  return(sum(auc_df$area_diff))
}

simple_roc_notrim = function(labels, hmds, title_text = NA, rev = F, legend_overlap = T, title_overlap = T)
{
  df = data.frame(labs = labels, hmd = hmds)
  roc_df = df %>% group_by(hmd) %>% summarize(tp = sum(labs == 1-rev), fp= sum(labs == 0+rev)) %>% ungroup %>% arrange(desc(hmd)) %>% mutate(tps = cumsum(tp), fps = cumsum(fp))
  roc_df$tpr = roc_df$tps/sum(roc_df$tp)
  roc_df$fpr = roc_df$fps/sum(roc_df$fp)
  
  roc_plt = ggplot(data = roc_df, aes(x = fpr, y = tpr, color = hmd)) + geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), size = 0.5, color = "black") + geom_line(size=1) + scale_color_gradientn(colors = pal) + theme_classic() + scale_x_continuous(name = "False Positive Rate", breaks = seq(0, 1, 0.25)) + scale_y_continuous(name = "True Positive Rate", breaks = seq(0, 1, 0.25)) + theme(text = element_text(size = 10, color = "black"), axis.text = element_text(size = 10, color = "black"), axis.title = element_text(size = 10, color = "black"), title = element_text(size = 10, color = "black"))
  
  if (!legend_overlap){ roc_plt = roc_plt + theme(legend.position = 'none') }
  if (!is.na(title_text) & title_overlap){ roc_plt = roc_plt + ggtitle(title_text) }
  print(roc_plt)
  print(roc_plt)
  
  auc_df = roc_df %>% tail(-1)
  auc_df$fpr_diff = auc_df$fpr - head(roc_df, -1)$fpr
  auc_df$area_diff = auc_df$fpr_diff*auc_df$tpr
  
  return(sum(auc_df$area_diff))
}

We now want to leave receiver operator characteristic curves for all these datasets.

simple_roc(hmd_diffexp_naive_npc$diffexp, hmd_diffexp_naive_npc$k4, title_text = "ROC Curve of Differential Expression from Naive to NPC by H3K4me3 Threshold", legend_overlap = F, title_overlap = F)
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5738976
simple_roc(hmd_diffexp_naive_npc$diffexp, hmd_diffexp_naive_npc$k9, title_text = "ROC Curve of Differential Expression from Naive to NPC by H3K9me3 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5014713
simple_roc(hmd_diffexp_naive_npc$diffexp, hmd_diffexp_naive_npc$k27, rev = T, title_text = "ROC Curve of Differential Expression from Naive to NPC by H3K27me3 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5124343
simple_roc_notrim(hmd_diffexp_naive_npc$diffexp, hmd_diffexp_naive_npc$logk27 - hmd_diffexp_naive_npc$logk4, rev = T, title_text = "ROC Curve of Differential Expression from Naive to NPC by Ln K27/K4 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5567235
simple_roc(hmd_diffexp_naive_npc$diffexp, hmd_diffexp_naive_npc$trans, title_text = "ROC Curve of Differential Expression from Naive to NPC by Bivalency Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5566414
simple_roc(hmd_diffexp_primed_npc$diffexp, hmd_diffexp_primed_npc$k4, title_text = "ROC Curve of Differential Expression from Primed to NPC by H3K4me3 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5723689
simple_roc(hmd_diffexp_primed_npc$diffexp, hmd_diffexp_primed_npc$k9, title_text = "ROC Curve of Differential Expression from Primed to NPC by H3K9me3 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.4929852
simple_roc(hmd_diffexp_primed_npc$diffexp, hmd_diffexp_primed_npc$k27, rev = T, title_text = "ROC Curve of Differential Expression from Primed to NPC by H3K27me3 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5235438
simple_roc_notrim(hmd_diffexp_primed_npc$diffexp, hmd_diffexp_primed_npc$logk27 - hmd_diffexp_primed_npc$logk4, rev = T, title_text = "ROC Curve of Differential Expression from Primed to NPC by Ln K27/K4 Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5778506
simple_roc(hmd_diffexp_primed_npc$diffexp, hmd_diffexp_primed_npc$trans, title_text = "ROC Curve of Differential Expression from Primed to NPC by Bivalency Threshold")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.4807126

We can also compare our modeling datasets by ROC curves to see if addition of trans bivalency as a parameter increases the accuracy of the model significantly by this metric.

simple_roc_notrim(hmd_diffexp_naive_npc_test$diffexp, hmd_diffexp_naive_npc_test$predicted, title_text = "ROC Curve of Diff Expression form Naive to NPC by the K4, K27, and logs model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5846549
simple_roc_notrim(hmd_diffexp_naive_npc_test$diffexp, hmd_diffexp_naive_npc_test$predicted_trans, title_text = "ROC Curve of Diff Expression form Naive to NPC by the K4, K27, trans, and logs model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5855772
simple_roc_notrim(hmd_diffexp_naive_npc_test$diffexp, hmd_diffexp_naive_npc_test$predicted_k9, title_text = "ROC Curve of Diff Expression form Naive to NPC by the K4, K27, logs, and K9 model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5846311
simple_roc_notrim(hmd_diffexp_primed_npc_test$diffexp, hmd_diffexp_primed_npc_test$predicted, title_text = "ROC Curve of Diff Expression form Primed to NPC by the K4, K27, and logs model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5825625
simple_roc_notrim(hmd_diffexp_primed_npc_test$diffexp, hmd_diffexp_primed_npc_test$predicted_trans, title_text = "ROC Curve of Diff Expression form Primed to NPC by the K4, K27, trans, and logs model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5828533
simple_roc_notrim(hmd_diffexp_primed_npc_test$diffexp, hmd_diffexp_primed_npc_test$predicted_k9, title_text = "ROC Curve of Diff Expression form Primed to NPC by the K4, K27, logs, and K9 model")
## `summarise()` ungrouping output (override with `.groups` argument)

## [1] 0.5807692